# DESCRIPTIVE STATISTICS

# Install and load packages if not already done

packages <- c("plyr", "ggplot2", "zoo", "scales", "stringr", "dplyr", "tidyverse", "text2vec", 
              "lubridate", "data.table", "quanteda", "quanteda.textstats", "magrittr", 
              "fixest", "modelsummary", "marginaleffects", "texreg")
# Attach `package` (given as a character string), installing it first when
# it cannot be attached from the current library.
#   package: name of the package, as a string
# Returns NULL invisibly when the package was already available; otherwise
# returns whatever `library()` returns after the fresh install.
install_and_load <- function(package) {
  attached <- require(package, character.only = TRUE)
  if (attached) {
    return(invisible(NULL))
  }
  # Not available yet: install it (with its dependencies) and attach it.
  install.packages(package, dependencies = TRUE)
  library(package, character.only = TRUE)
}
invisible(lapply(packages, install_and_load))

# Load data

# Switch to the data folder and restore the objects stored in the .rda file.
# The sections below work on a data frame called `df_ft_all`, which is
# expected to come from this file.
setwd(dir = "FOLDER PATH")
load(file = "df_ft_all_final.rda")

# OVERALL TABLE

# Row counts for the full data set: the grand total plus the mean, minimum,
# maximum and standard deviation of the number of rows per year.
year_counts <- table(df_ft_all$year) # rows per year
year_counts # show the per-year counts in the console

num_rows <- nrow(df_ft_all) # overall total
mean_rows_per_year <- mean(year_counts)
min_rows <- min(year_counts)
max_rows <- max(year_counts)
std_dev_rows_per_year <- sd(year_counts)

# One-row table; the party-level rows are stacked underneath it later on.
overall_summary <- data.frame(
  "Party or sub group" = "Overall",
  "Number of rows" = num_rows,
  "Mean rows per year" = mean_rows_per_year,
  "Minimum rows of any year" = min_rows,
  "Maximum rows of any year" = max_rows,
  "Standard deviation of rows per year" = std_dev_rows_per_year
)

# Making overall table on party level

# Collapse the detailed party codes into the groups used by the descriptive
# tables: codes 3, 7, 8 and 16 become 3; codes 4, 13, 14 and 15 become 4;
# every other code (including 2, 5, 99 and 999) keeps its own value.
df_ft_all$party_grouped_descr <- local({
  party_code <- df_ft_all$party

  # Built from the innermost fallback outwards, so each later step only
  # overrides the codes it names and leaves the rest untouched.
  grouped <- ifelse(party_code %in% c(99), 99, party_code)
  grouped <- ifelse(party_code %in% c(999), 999, grouped)
  grouped <- ifelse(party_code %in% c(5), 5, grouped)
  grouped <- ifelse(party_code %in% c(4, 13, 14, 15), 4, grouped)
  grouped <- ifelse(party_code %in% c(3, 7, 8, 16), 3, grouped)
  ifelse(party_code == 2, 2, grouped)
})

# Party-level version of the overall table: one row of row-count statistics
# per party group, stacked under the "Overall" row and written to disk.

# Row-count statistics for one subset of the data.
#   data:  the rows of one party group (must contain a `year` column)
#   label: value to put in the "Party or sub group" column
# Returns a one-row data frame with the same columns as `overall_summary`.
summarise_rows_per_year <- function(data, label) {
  year_counts <- table(data$year)
  data.frame(
    "Party or sub group" = label,
    "Number of rows" = nrow(data),
    "Mean rows per year" = mean(year_counts),
    "Minimum rows of any year" = min(year_counts),
    "Maximum rows of any year" = max(year_counts),
    "Standard deviation of rows per year" = sd(year_counts)
  )
}

party_groups <- unique(df_ft_all$party_grouped_descr)

# Build all per-group rows first and bind them once, instead of growing a
# data frame inside a loop.
party_summaries <- lapply(party_groups, function(party_group) {
  # `%in%` never returns NA. With `==`, rows whose group code is missing
  # would be selected as all-NA rows in every group (inflating the row
  # count), and a missing group code itself would select every row.
  in_group <- df_ft_all$party_grouped_descr %in% party_group
  summarise_rows_per_year(df_ft_all[in_group, ], party_group)
})
combined_summary <- do.call(rbind, party_summaries)
if (is.null(combined_summary)) {
  # No party groups at all: keep an empty data frame, as before.
  combined_summary <- data.frame()
}

descr_overall <- rbind(overall_summary, combined_summary)
write.csv2(descr_overall,"FOLDER PATH", row.names = FALSE, fileEncoding = "UTF-8")

# TABLE OF PARTIES AND SPEECHES PER YEAR, FOR APPENDIX
# Wide table for the appendix: one row per year, one column per party group,
# each cell holding the number of rows for that year and group.
#
# `party_grouped_descr` is reused as created for the party-level table
# earlier in the script; the identical recoding is not repeated here.

# Rows per party group and year.
party_year_counts <- df_ft_all %>%
  group_by(party_grouped_descr, year) %>%
  summarise(count = n(), .groups = "drop")

# Re-key by year first so the wide table comes out ordered by year; grouping
# is dropped explicitly instead of relying on the summarise() default.
summarized_df <- party_year_counts %>%
  group_by(year, party_grouped_descr) %>%
  summarise(total_count = sum(count), .groups = "drop")
summarized_df$year <- as.numeric(summarized_df$year)

reshaped_df <- summarized_df %>%
  pivot_wider(names_from = party_grouped_descr, values_from = total_count)

# No row names are set: assigning them on a tibble is deprecated, and the
# export below is written with row.names = FALSE anyway (the year is already
# a regular column).
write.csv2(reshaped_df,"FOLDER PATH", row.names = FALSE, fileEncoding = "UTF-8")


# CONTROLS AND IVS

# Step 1: yearly sums of the six variables summarised in this section
# (missing values are ignored when summing).
summary_table_iv <- df_ft_all %>%
  group_by(year) %>%
  summarise(
    arbejdsløshedsstøtte_sum = sum(arbejdsløshedsstøtte, na.rm = TRUE),
    arbejdsdag_sum = sum(arbejdsdag, na.rm = TRUE),
    dyrtid_sum = sum(dyrtid, na.rm = TRUE),
    merindkomstskat_sum = sum(merindkomstskat, na.rm = TRUE),
    repress_sum = sum(repress, na.rm = TRUE),
    revterm_sum = sum(socrevall, na.rm = TRUE)
  )

# Step 2: long format with one row per year and variable. Everything except
# `year` is a `*_sum` column, so excluding `year` selects exactly those six.
df_long <- pivot_longer(summary_table_iv, cols = -year, names_to = "variable")

# Step 3: descriptive statistics of the yearly sums, one row per variable.
summary_stats_table_iv <- summarise(
  group_by(df_long, variable),
  Total = sum(value, na.rm = TRUE),
  Mean = mean(value, na.rm = TRUE),
  Minimum = min(value, na.rm = TRUE),
  Maximum = max(value, na.rm = TRUE),
  SD = sd(value, na.rm = TRUE)
)


# Controls

# Descriptive statistics for the control variables, computed over the rows
# of `df_ft_all`, then stacked under the IV statistics and written to disk.
control_vars <- c("growth", "unemp", "inflation", "n_tokens_all")

# Keep only the control columns before reshaping. Pivoting the full data
# frame would copy every other column once per control variable, although
# only `variable` and `value` are used below.
df_long <- df_ft_all %>%
  dplyr::select(dplyr::all_of(control_vars)) %>%
  pivot_longer(cols = dplyr::all_of(control_vars), names_to = "variable")

# Calculate summary statistics (one row per control variable)
summary_stats_table_controls <- df_long %>%
  group_by(variable) %>%
  summarize(
    Total = sum(value, na.rm = TRUE),
    Mean = mean(value, na.rm = TRUE),
    Minimum = min(value, na.rm = TRUE),
    Maximum = max(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE)
  )

# Both tables share the columns variable/Total/Mean/Minimum/Maximum/SD.
summary_stats_all <- rbind(summary_stats_table_iv, summary_stats_table_controls)
write.csv2(summary_stats_all,"FOLDER PATH", row.names = FALSE, fileEncoding = "UTF-8")



# TOKENS PER YEAR
#Tokens per year, 1910-29
# `year` is converted to numeric on the data frame itself, so every later
# section sees the converted column.
df_ft_all$year <- as.numeric(df_ft_all$year)

# Total tokens per year, 1910-29
tokens_per_year <- summarise(
  group_by(df_ft_all, year),
  total_tokens = sum(n_tokens)
)

# Mean of the yearly token totals, 1910-29 (printed to the console)
mean_tokens_per_year <- summarise(
  tokens_per_year,
  mean_tokens = mean(total_tokens)
)
mean_tokens_per_year

# Mean tokens per row within each year, then the mean of those yearly
# means, 1910-29 (printed to the console)
average_tokens_per_row_per_year <- summarise(
  group_by(df_ft_all, year),
  average_tokens = mean(n_tokens)
)
mean(average_tokens_per_row_per_year$average_tokens)

# FIGURE 2 - PARTIES AND SPEECHES PER YEAR

#Speeches per year per party
# Figure 2: number of rows (speeches) per year for the four party groups
# S, K, V and R, drawn as greyscale lines and saved to disk.
library(dplyr)

# Speeches per year per (ungrouped) party code, plus a wide version with one
# column per year and 0 where a party has no rows in a year.
party_year_counts <- df_ft_all %>%
  group_by(party, year) %>%
  summarise(count = n(), .groups = "drop")
party_year_counts_wide <- party_year_counts %>%
  pivot_wider(names_from = year, values_from = count, values_fill = 0)

# Map party codes onto the four plotted groups; every other code is kept as
# its own label and filtered out below.
grouped_party_df <- party_year_counts %>%
  mutate(
    grouped_party = case_when(
      party == 2 ~ "S",
      party %in% c(3, 7, 8, 16) ~ "K",
      party %in% c(4, 13, 14, 15) ~ "V",
      party == 5 ~ "R",
      TRUE ~ as.character(party) # Keep other parties as is
    )
  )
parties_to_plot <- c("S", "K", "V", "R")

filtered_df <- grouped_party_df %>%
  filter(grouped_party %in% parties_to_plot)

# Sum the underlying party codes within each plotted group and year.
summarized_df <- filtered_df %>%
  group_by(year, grouped_party) %>%
  summarise(total_count = sum(count), .groups = "drop")
summarized_df$year <- as.numeric(summarized_df$year)
summarized_df <- summarized_df %>%
  rename(Party = grouped_party)

party_colors_greyscale <- c("S" = "black", "R" = "grey", "K" = "darkgrey", "V" = "lightgrey")
party_linetypes_greyscale <- c("S" = "dotted", "R" = "dotdash", "K" = "twodash", "V" = "solid")
# The factor levels determine the order of the legend entries.
summarized_df$Party <- factor(summarized_df$Party, levels = c("V", "S", "K", "R"))

party_speeches_plot <- ggplot(
  summarized_df,
  aes(x = year, y = total_count, color = Party, linetype = Party)
) +
  # `linewidth` replaces the `size` argument, which is deprecated for lines
  # since ggplot2 3.4.0.
  geom_line(linewidth = 0.75) +
  scale_color_manual(values = party_colors_greyscale) +
  scale_linetype_manual(values = party_linetypes_greyscale) +
  labs(
    x = "Year",
    y = "Number of speeches"
  ) +
  # No fill scale: the plot maps no fill aesthetic, so one would do nothing.
  theme_minimal()
party_speeches_plot
ggsave("FOLDER PATH", plot = party_speeches_plot, width = 5, height = 3)

#Most frequent speakers per year
# Distinct speakers within each year.
unique_speakers_per_year <- distinct(group_by(df_ft_all, year), speaker)

# Number of rows per speaker and year, leaving out party code 1.
speaker_counts <- df_ft_all %>%
  filter(party != 1) %>%
  group_by(year, speaker) %>%
  summarise(count = n())

# The ten speakers with the most rows in each year.
top_speakers_per_year <- speaker_counts %>%
  arrange(year, desc(count)) %>%
  group_by(year) %>%
  slice_head(n = 10) %>%
  ungroup()

# Mean row count of those top-ten speakers, per year ...
mean_frequency_per_year <- summarise(
  group_by(top_speakers_per_year, year),
  mean_frequency = mean(count)
)
# ... and across all years (printed to the console).
summarise(top_speakers_per_year, mean_frequency = mean(count))
